In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
In [2]:
sd = pd.read_csv("Sales Data.csv")

UnderStanding Data¶

In [3]:
sd
Out[3]:
Unnamed: 0 Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City Hour
0 0 295665 Macbook Pro Laptop 1 1700.00 2019-12-30 00:01:00 136 Church St, New York City, NY 10001 12 1700.00 New York City 0
1 1 295666 LG Washing Machine 1 600.00 2019-12-29 07:03:00 562 2nd St, New York City, NY 10001 12 600.00 New York City 7
2 2 295667 USB-C Charging Cable 1 11.95 2019-12-12 18:21:00 277 Main St, New York City, NY 10001 12 11.95 New York City 18
3 3 295668 27in FHD Monitor 1 149.99 2019-12-22 15:13:00 410 6th St, San Francisco, CA 94016 12 149.99 San Francisco 15
4 4 295669 USB-C Charging Cable 1 11.95 2019-12-18 12:38:00 43 Hill St, Atlanta, GA 30301 12 11.95 Atlanta 12
... ... ... ... ... ... ... ... ... ... ... ...
185945 13617 222905 AAA Batteries (4-pack) 1 2.99 2019-06-07 19:02:00 795 Pine St, Boston, MA 02215 6 2.99 Boston 19
185946 13618 222906 27in FHD Monitor 1 149.99 2019-06-01 19:29:00 495 North St, New York City, NY 10001 6 149.99 New York City 19
185947 13619 222907 USB-C Charging Cable 1 11.95 2019-06-22 18:57:00 319 Ridge St, San Francisco, CA 94016 6 11.95 San Francisco 18
185948 13620 222908 USB-C Charging Cable 1 11.95 2019-06-26 18:35:00 916 Main St, San Francisco, CA 94016 6 11.95 San Francisco 18
185949 13621 222909 AAA Batteries (4-pack) 1 2.99 2019-06-25 14:33:00 209 11th St, Atlanta, GA 30301 6 2.99 Atlanta 14

185950 rows × 11 columns

In [4]:
sd.head()
Out[4]:
Unnamed: 0 Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City Hour
0 0 295665 Macbook Pro Laptop 1 1700.00 2019-12-30 00:01:00 136 Church St, New York City, NY 10001 12 1700.00 New York City 0
1 1 295666 LG Washing Machine 1 600.00 2019-12-29 07:03:00 562 2nd St, New York City, NY 10001 12 600.00 New York City 7
2 2 295667 USB-C Charging Cable 1 11.95 2019-12-12 18:21:00 277 Main St, New York City, NY 10001 12 11.95 New York City 18
3 3 295668 27in FHD Monitor 1 149.99 2019-12-22 15:13:00 410 6th St, San Francisco, CA 94016 12 149.99 San Francisco 15
4 4 295669 USB-C Charging Cable 1 11.95 2019-12-18 12:38:00 43 Hill St, Atlanta, GA 30301 12 11.95 Atlanta 12
In [5]:
sd.describe()
Out[5]:
Unnamed: 0 Order ID Quantity Ordered Price Each Month Sales Hour
count 185950.000000 185950.000000 185950.000000 185950.000000 185950.000000 185950.000000 185950.000000
mean 8340.388475 230417.569379 1.124383 184.399735 7.059140 185.490917 14.413305
std 5450.554093 51512.737110 0.442793 332.731330 3.502996 332.919771 5.423416
min 0.000000 141234.000000 1.000000 2.990000 1.000000 2.990000 0.000000
25% 3894.000000 185831.250000 1.000000 11.950000 4.000000 11.950000 11.000000
50% 7786.000000 230367.500000 1.000000 14.950000 7.000000 14.950000 15.000000
75% 11872.000000 275035.750000 1.000000 150.000000 10.000000 150.000000 19.000000
max 25116.000000 319670.000000 9.000000 1700.000000 12.000000 3400.000000 23.000000
In [7]:
sd.columns
Out[7]:
Index(['Unnamed: 0', 'Order ID', 'Product', 'Quantity Ordered', 'Price Each',
       'Order Date', 'Purchase Address', 'Month', 'Sales', 'City', 'Hour'],
      dtype='object')
In [8]:
sd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        185950 non-null  int64  
 1   Order ID          185950 non-null  int64  
 2   Product           185950 non-null  object 
 3   Quantity Ordered  185950 non-null  int64  
 4   Price Each        185950 non-null  float64
 5   Order Date        185950 non-null  object 
 6   Purchase Address  185950 non-null  object 
 7   Month             185950 non-null  int64  
 8   Sales             185950 non-null  float64
 9   City              185950 non-null  object 
 10  Hour              185950 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 15.6+ MB
In [9]:
sd.count()
Out[9]:
Unnamed: 0          185950
Order ID            185950
Product             185950
Quantity Ordered    185950
Price Each          185950
Order Date          185950
Purchase Address    185950
Month               185950
Sales               185950
City                185950
Hour                185950
dtype: int64
In [10]:
sd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        185950 non-null  int64  
 1   Order ID          185950 non-null  int64  
 2   Product           185950 non-null  object 
 3   Quantity Ordered  185950 non-null  int64  
 4   Price Each        185950 non-null  float64
 5   Order Date        185950 non-null  object 
 6   Purchase Address  185950 non-null  object 
 7   Month             185950 non-null  int64  
 8   Sales             185950 non-null  float64
 9   City              185950 non-null  object 
 10  Hour              185950 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 15.6+ MB
In [11]:
sd['Order Date'] = pd.to_datetime(sd['Order Date'])
sd['Order Date']
Out[11]:
0        2019-12-30 00:01:00
1        2019-12-29 07:03:00
2        2019-12-12 18:21:00
3        2019-12-22 15:13:00
4        2019-12-18 12:38:00
                 ...        
185945   2019-06-07 19:02:00
185946   2019-06-01 19:29:00
185947   2019-06-22 18:57:00
185948   2019-06-26 18:35:00
185949   2019-06-25 14:33:00
Name: Order Date, Length: 185950, dtype: datetime64[ns]
In [12]:
sd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Unnamed: 0        185950 non-null  int64         
 1   Order ID          185950 non-null  int64         
 2   Product           185950 non-null  object        
 3   Quantity Ordered  185950 non-null  int64         
 4   Price Each        185950 non-null  float64       
 5   Order Date        185950 non-null  datetime64[ns]
 6   Purchase Address  185950 non-null  object        
 7   Month             185950 non-null  int64         
 8   Sales             185950 non-null  float64       
 9   City              185950 non-null  object        
 10  Hour              185950 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5), object(3)
memory usage: 15.6+ MB
In [13]:
sd.isnull().sum()
Out[13]:
Unnamed: 0          0
Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
Month               0
Sales               0
City                0
Hour                0
dtype: int64

Data Visualization¶

In [14]:
sd['Product'].unique()
Out[14]:
array(['Macbook Pro Laptop', 'LG Washing Machine', 'USB-C Charging Cable',
       '27in FHD Monitor', 'AA Batteries (4-pack)',
       'Bose SoundSport Headphones', 'AAA Batteries (4-pack)',
       'ThinkPad Laptop', 'Lightning Charging Cable', 'Google Phone',
       'Wired Headphones', 'Apple Airpods Headphones', 'Vareebadd Phone',
       'iPhone', '20in Monitor', '34in Ultrawide Monitor',
       'Flatscreen TV', '27in 4K Gaming Monitor', 'LG Dryer'],
      dtype=object)
In [15]:
Products= sd.groupby('Product')['Quantity Ordered'].sum().sort_values()
Products
Out[15]:
Product
LG Dryer                        646
LG Washing Machine              666
Vareebadd Phone                2068
20in Monitor                   4129
ThinkPad Laptop                4130
Macbook Pro Laptop             4728
Flatscreen TV                  4819
Google Phone                   5532
34in Ultrawide Monitor         6199
27in 4K Gaming Monitor         6244
iPhone                         6849
27in FHD Monitor               7550
Bose SoundSport Headphones    13457
Apple Airpods Headphones      15661
Wired Headphones              20557
Lightning Charging Cable      23217
USB-C Charging Cable          23975
AA Batteries (4-pack)         27635
AAA Batteries (4-pack)        31017
Name: Quantity Ordered, dtype: int64
In [16]:
sns.barplot(y=Products.index, x=Products.values).set(title='Quantity Ordered per Product')
plt.show()
In [17]:
sd['Year'] = pd.to_datetime(sd['Order Date']).dt.year
In [18]:
sd.head()
Out[18]:
Unnamed: 0 Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City Hour Year
0 0 295665 Macbook Pro Laptop 1 1700.00 2019-12-30 00:01:00 136 Church St, New York City, NY 10001 12 1700.00 New York City 0 2019
1 1 295666 LG Washing Machine 1 600.00 2019-12-29 07:03:00 562 2nd St, New York City, NY 10001 12 600.00 New York City 7 2019
2 2 295667 USB-C Charging Cable 1 11.95 2019-12-12 18:21:00 277 Main St, New York City, NY 10001 12 11.95 New York City 18 2019
3 3 295668 27in FHD Monitor 1 149.99 2019-12-22 15:13:00 410 6th St, San Francisco, CA 94016 12 149.99 San Francisco 15 2019
4 4 295669 USB-C Charging Cable 1 11.95 2019-12-18 12:38:00 43 Hill St, Atlanta, GA 30301 12 11.95 Atlanta 12 2019
In [22]:
month_df = sd.groupby(['Year','Month'])
Monthly_sales = pd.DataFrame(month_df['Sales'].sum(), columns = ['Sales'])
avg_order_month = month_df ['Sales'].mean()
avg_order_month = avg_order_month.reset_index()
Monthly_sales = Monthly_sales.reset_index()
Monthly_sales = Monthly_sales.loc[Monthly_sales['Year']==2019]
Monthly_sales['AOV'] = avg_order_month['Sales']
Monthly_sales.sort_values
Out[22]:
<bound method DataFrame.sort_values of     Year  Month       Sales         AOV
0   2019      1  1813586.44  187.450795
1   2019      2  2202022.42  183.884962
2   2019      3  2807100.38  185.250471
3   2019      4  3390670.24  185.495390
4   2019      5  3152606.75  190.305852
5   2019      6  2577802.26  190.187565
6   2019      7  2647775.76  185.249826
7   2019      8  2244467.88  187.648849
8   2019      9  2097560.13  180.497387
9   2019     10  3736726.88  184.238580
10  2019     11  3199603.20  182.074956
11  2019     12  4613443.34  184.655913>
In [23]:
plt.figure(figsize=(10, 6))
sns.barplot(data=Monthly_sales, x='Month', y='Sales')
plt.title('Every Month Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.show()
In [21]:
fig = px.bar(data_frame=Monthly_sales, x='Month', y='Sales', title='Every Month Sales')
fig.show()
In [24]:
product_sales = sd.groupby(['Year','Product'])
product_sales = pd.DataFrame(product_sales["Sales"].sum(), columns=["Sales"])
product_sales = product_sales.reset_index()
product_sales 
Out[24]:
Year Product Sales
0 2019 20in Monitor 453818.74
1 2019 27in 4K Gaming Monitor 2434707.57
2 2019 27in FHD Monitor 1131974.53
3 2019 34in Ultrawide Monitor 2355558.01
4 2019 AA Batteries (4-pack) 106118.40
5 2019 AAA Batteries (4-pack) 92725.88
6 2019 Apple Airpods Headphones 2348550.00
7 2019 Bose SoundSport Headphones 1345265.46
8 2019 Flatscreen TV 1445400.00
9 2019 Google Phone 3318600.00
10 2019 LG Dryer 387600.00
11 2019 LG Washing Machine 399600.00
12 2019 Lightning Charging Cable 347004.45
13 2019 Macbook Pro Laptop 8035900.00
14 2019 ThinkPad Laptop 4127958.72
15 2019 USB-C Charging Cable 286453.45
16 2019 Vareebadd Phone 826800.00
17 2019 Wired Headphones 246430.47
18 2019 iPhone 4792900.00
19 2020 20in Monitor 329.97
20 2020 27in 4K Gaming Monitor 389.99
21 2020 27in FHD Monitor 449.97
22 2020 AAA Batteries (4-pack) 14.95
23 2020 Apple Airpods Headphones 600.00
24 2020 Bose SoundSport Headphones 299.97
25 2020 Flatscreen TV 300.00
26 2020 Google Phone 600.00
27 2020 Lightning Charging Cable 89.70
28 2020 Macbook Pro Laptop 1700.00
29 2020 ThinkPad Laptop 1999.98
30 2020 USB-C Charging Cable 47.80
31 2020 Vareebadd Phone 400.00
32 2020 Wired Headphones 47.96
33 2020 iPhone 1400.00
In [25]:
# Create a pie chart figure
fig = go.Figure(data=[go.Pie(labels=product_sales["Product"].loc[product_sales["Year"]==2019], values=product_sales["Sales"].loc[product_sales["Year"]==2019])])

# Set layout options
fig.update_layout(title='Popular Product for year 2019')

# Display the chart
fig.show()
In [26]:
Popular_Hour = sd.groupby(['Month','Hour'])
Popular_Hour = pd.DataFrame(Popular_Hour['Sales'].sum(), columns= ['Sales'])
Popular_Hour = Popular_Hour.reset_index()
Popular_Hour = Popular_Hour.loc[Popular_Hour["Month"]==12]
Popular_Hour
Out[26]:
Month Hour Sales
264 12 0 93795.21
265 12 1 63311.56
266 12 2 32250.44
267 12 3 16567.74
268 12 4 19626.14
269 12 5 33728.66
270 12 6 54690.49
271 12 7 120218.92
272 12 8 168953.52
273 12 9 217493.38
274 12 10 271169.35
275 12 11 300369.73
276 12 12 288275.69
277 12 13 294821.51
278 12 14 288622.89
279 12 15 269882.01
280 12 16 233328.75
281 12 17 290406.13
282 12 18 285893.25
283 12 19 323274.15
284 12 20 321235.30
285 12 21 255373.54
286 12 22 212241.78
287 12 23 157913.20
In [27]:
fig = go.Figure(data=go.Scatter(x=Popular_Hour["Hour"], y=Popular_Hour["Sales"]))

# Set layout options
fig.update_layout(title='Peak Hours in a Month')

# Display the chart
fig.show()
In [28]:
fig = px.bar(Popular_Hour, x='Hour', y='Sales', title=f'Peak Hours for Sales')
fig.show()
In [29]:
Area_Value = sd.groupby(['City'])
Area_Value = Area_Value['Sales'].sum().reset_index()
In [30]:
Area_Value
Out[30]:
City Sales
0 Atlanta 2795498.58
1 Austin 1819581.75
2 Boston 3661642.01
3 Dallas 2767975.40
4 Los Angeles 5452570.80
5 New York City 4664317.43
6 Portland 2320490.61
7 San Francisco 8262203.91
8 Seattle 2747755.48
In [31]:
import plotly.express as px
import pandas as pd


# Plotting cities and population
fig = px.scatter(Area_Value, x='City', y='Sales', text='City',
                 title='Cities and Sales')

# Customize the plot layout
fig.update_layout(
    xaxis_title='City',
    yaxis_title='Sales',
    hovermode='closest'
)

# Display the plot
fig.show()
In [32]:
m_sales=sd.groupby('Product')['Sales'].sum().nlargest(5).plot.bar(title='Best Product for Sales',figsize=(5,5))
m_sales
Out[32]:
<Axes: title={'center': 'Best Product for Sales'}, xlabel='Product'>
In [33]:
m_sales=sd.groupby('Product')['Sales'].sum().nsmallest(5).plot.bar(title='Lowest Product for Sales',figsize=(5,5))
m_sales
Out[33]:
<Axes: title={'center': 'Lowest Product for Sales'}, xlabel='Product'>
In [34]:
Top_City = sd.groupby('City')['Sales'].sum().nlargest(5)
In [35]:
Top_City
Out[35]:
City
 San Francisco    8262203.91
 Los Angeles      5452570.80
 New York City    4664317.43
 Boston           3661642.01
 Atlanta          2795498.58
Name: Sales, dtype: float64
In [36]:
sns.barplot(y=Top_City.index,x=Top_City).set(title='Top Five City Sales')
Out[36]:
[Text(0.5, 1.0, 'Top Five City Sales')]
In [ ]: